// Short vector-type aliases that expand to the OpenCL float vector types.
#define vec2 float2
#define vec3 float3
#define vec4 float4
// Component-selector aliases: `.rgb` / `.rgba` expand to `.xyz` / `.xyzw`.
#define rgb xyz
#define rgba xyzw
// Pi as a single-precision literal (not referenced in the visible part of this file).
#define PI 3.1415926535897932f

// Blur strength factors consumed by MAIN:
//   motionBlur  multiplies the positional velocity term,
//   radialBlur  multiplies the scale-driven radial term,
//   rotateblur  multiplies the rotation-driven tangential term.
#define motionBlur 0.2f
#define radialBlur 0.9f
#define rotateblur 0.1f
// Designed by: RuanShengQiang 

// Scale jitter limits per axis. MAIN remaps the random scale signal using
// (Up + Down) as the span and subtracts Down as the lower offset. With all
// four at 0.0f the scale jitter evaluates to zero.
const __constant float ScaleXUp = 0.0f; 
const __constant float ScaleXDown = 0.0f; 
const __constant float ScaleYUp = 0.0f; 
const __constant float ScaleYDown = 0.0f; 

// Envelope selector passed to decay(): 1, 2 or 3 pick one of its three
// branches; any other value leaves the input untouched.
const __constant int InOutStage = 3;

// Amplitude handed to every rand() call in MAIN.
const __constant float amplitude = 0.5f;//PREFIX(amplitude);
// Image sampler used by INPUT(): normalized coordinates, clamp-to-edge
// addressing, nearest-neighbour filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE |CLK_FILTER_NEAREST;

/*
 * Rotate `uv` around `center` by `theta` (radians, as consumed by sin/cos).
 * The offset from the centre is multiplied by the matrix
 *   [ cos(theta)  -sin(theta) ]
 *   [ sin(theta)   cos(theta) ]
 * and the centre is added back.
 */
static vec2 rotateFunc(vec2 uv, vec2 center, float theta)
{
	const float c = cos(theta);
	const float s = sin(theta);
	const vec2 rel = uv - center;

	vec2 rotated;
	rotated.x = dot((vec2)(c, -s), rel);
	rotated.y = dot((vec2)(s, c), rel);

	return rotated + center;
}

/*
 * Scale `uv` about `center`: the offset from the centre is divided
 * component-wise by `scale`, then the centre is added back.
 * No guard against zero components in `scale` is applied here.
 */
static vec2 scaleFunc(vec2 uv, vec2 scale,vec2 center)
{
	vec2 offset = uv - center;
	vec2 rescaled = offset / scale;
	return rescaled + center;
}

/*
 * Floored modulo applied per component: x - y * floor(x / y).
 * For y > 0 the result lies in [0, y) even when x is negative.
 */
static vec2 myMod(vec2 x, float y)
{
	vec2 wholeSteps = floor(x / y);
	return x - y * wholeSteps;
}


/*
 * Fold an arbitrary coordinate into [0, 1) with mirrored repetition.
 * Cells with an even integer part keep their fractional part; cells with
 * an odd integer part have it reflected.
 */
static vec2 mirror(vec2 uv)
{
	/* 0 for even cells, 1 for odd cells (per component). */
	vec2 cellParity = myMod(floor(uv), 2.0f);
	/* +1 for even cells, -1 for odd cells. */
	vec2 flip = -2.0f*cellParity + 1.0f;
	vec2 fracPart = myMod(uv, 1.0f);

	vec2 folded = 2.0f*cellParity + flip*fracPart;
	return myMod(folded, 1.0f);
}

/*
 * Hash a 2D coordinate to a float: takes the fractional part of
 * sin(dot(co, k)) * 43758.5453123. The integer part written by fract()
 * is discarded.
 */
static float bash(vec2 co){
	float wholePart;
	float scrambled = sin(dot(co,(vec2)(12.9898f,78.233f)))*43758.5453123f;
	return fract(scrambled, &wholePart);
}

/*
 * Helper for rand(): sum of the five cosine terms evaluated at time `t`.
 * Each term is the time derivative of the matching sine term in rand()'s
 * x component (before the amplitude gain is applied).
 */
static float randVelocityTerm(float t, float frequency)
{
	float phase = t * frequency;
	float sum = frequency*cos(phase);
	sum += cos(phase*2.1f)*1.828f*frequency*2.1f;
	sum += cos(phase*1.72f)*4.0f*frequency*1.72f;
	sum += cos(phase*2.221f)*3.488f*frequency*2.221f;
	sum += cos(phase*3.1122f)*2.5f*frequency*3.1122f;
	return sum;
}

/*
 * Smooth pseudo-random signal built from five sine waves.
 *
 *   frequency  base angular frequency of the waves
 *   amplitude  output gain (multiplied by 0.06)
 *   tIn        current time
 *   offset     time shift subtracted from tIn, used to decorrelate channels
 *
 * Returns (value, velocity): x is the sine sum at (tIn - offset); y is a
 * 0.75 / 0.25 blend of the cosine (derivative) sums at that time and
 * 0.02 time units earlier.
 */
static vec2 rand(float frequency,float amplitude,float tIn,float offset)
{
	const float gain = amplitude*0.06f;
	const float tNow = tIn - offset;
	const float tPrev = tNow - 0.02f;
	const float phase = tNow * frequency;

	vec2 result;

	/* Value: weighted sum of sines at incommensurate frequency multiples. */
	float value = sin(phase);
	value += sin(phase*2.1f)*1.828f;
	value += sin(phase*1.72f)*4.0f;
	value += sin(phase*2.221f)*3.488f;
	value += sin(phase*3.1122f)*2.5f;
	result.x = value * gain;

	/* Velocity: blend the current sample with a slightly earlier one. */
	float velNow = randVelocityTerm(tNow, frequency) * gain;
	float velPrev = randVelocityTerm(tPrev, frequency) * gain;
	result.y = velNow*0.75f + velPrev*0.25f;

	return result;
}

/*
 * Sample the source image at coordinate `tc`.
 * `tc` is first remapped as tc * (origROI[2], origROI[3]) +
 * (origROI[0], origROI[1]) and then read through the file-level sampler.
 * The fetched texel is returned with its x and z channels swapped (.zyxw).
 * NOTE(review): origROI presumably holds (offset.x, offset.y, size.x, size.y)
 * of the region of interest in normalized units - confirm against FilterParam.
 */
static vec4 INPUT(image2d_t src_data, vec2 tc, __global FilterParam* param)
{
	vec2 roiScale = (vec2)(param->origROI[2], param->origROI[3]);
	vec2 roiShift = (vec2)(param->origROI[0], param->origROI[1]);
	vec2 coord = tc*roiScale + roiShift;

	vec4 texel = read_imagef(src_data, sampler, coord);
	return texel.zyxw;
}

/*
 * Apply a fade envelope to a (value, velocity) pair.
 *
 *   vel         x = value, y = velocity (as produced by rand())
 *   t           progress; the envelopes below are written for t in [0, 1]
 *   InOutStage  1: ramp-up   envelope a^6 with a = -0.7*t - 0.3
 *               2: ramp-down envelope a^6 with a =  0.7*t - 1.0
 *               3: ramp-up over the first half of t, ramp-down over the second
 *               other: `vel` is returned unchanged
 *
 * x is multiplied by the envelope. y becomes
 *   (+/-)6*a^3 * (already scaled x) + envelope * y.
 * NOTE(review): this is not the exact product-rule derivative of the
 * envelope (that would use 6*a^5*da/dt and the unscaled x); it is kept
 * as-is because the visual result may have been tuned around it.
 */
vec2 decay(vec2 vel, float t, int InOutStage)
{
	vec2 result = vel;
	int stage = InOutStage;

	/* Stage 3 is stage 1 followed by stage 2, each on half of the timeline. */
	if (3 == stage)
	{
		t = 2.0f*t;
		if (t < 1.0f)
		{
			stage = 1;
		}
		else
		{
			t = t - 1.0f;
			stage = 2;
		}
	}

	if (1 == stage)
	{
		float base = -0.7f*t-0.3f;
		float cube = base*base*base;
		float fade = cube*cube;
		float dFade = 6.0f*cube;
		result.x = result.x*fade;
		result.y = dFade*result.x + fade*result.y;
	}
	else if (2 == stage)
	{
		float base = t*0.7f-1.0f;
		float cube = base*base*base;
		float fade = cube*cube;
		float dFade = 6.0f*cube;
		result.x = result.x*fade;
		result.y = -dFade*result.x + fade*result.y;
	}

	return result;
}


/*
 * Shake kernel: for every output pixel, jitters the sampling coordinate by a
 * time-varying pseudo-random translation, rotation and scale, optionally
 * accumulates several taps along the resulting velocity direction, and can
 * offset the y/z colour channels from x/w for a colour-fringe effect.
 *
 *   src_data        source image, sampled through INPUT()
 *   dest_data       destination image, written at the work-item's global id
 *   param           filter parameters; cur_time, total_time and (via INPUT)
 *                   origROI are read here
 *   frequency       base frequency forwarded to rand()
 *   pos_X, pos_Y    multipliers for the positional jitter on each axis
 *   rot_Z           multiplier for the rotational jitter (result is treated
 *                   as degrees and converted to radians)
 *   RGB_Separate    1 enables the channel offset, any other value disables it
 *   needMotionBlur  1 takes `samples` taps along the motion direction;
 *                   otherwise a single tap is taken
 */
__kernel  void MAIN(
      __read_only image2d_t src_data,
      __write_only image2d_t dest_data,        //Data in global memory
	  __global FilterParam* param,
	  float frequency,
	  float pos_X,
	  float pos_Y, 
	  float rot_Z, 
	  int RGB_Separate,
	  int needMotionBlur)
{
	float process = param->cur_time / param->total_time;
	float iGlobalTime = param->cur_time;

	int W = get_global_size(0);
	int H = get_global_size(1);
	float2 iResolution = (float2)(W,H);
	vec2 fragCoord = (vec2)(get_global_id0( param), get_global_id1( param));
	// Pixel centre in normalized [0,1] coordinates.
	vec2 uv = (fragCoord + (vec2)(0.5f)) / iResolution.xy;

	// Each signal is a (value, velocity) pair; distinct frequencies and time
	// offsets keep the channels from moving in lockstep.
	vec2 motion = decay( rand(frequency*0.7777f, amplitude, iGlobalTime, 0.0f), process, InOutStage);
	vec2 scaleRandX = rand(frequency*0.6666f, amplitude, iGlobalTime,1.7f);
	vec2 scaleRandY = rand(frequency*0.9f, amplitude, iGlobalTime,0.9f);

	vec2 posRandX =  decay( rand(frequency*0.833f, amplitude, iGlobalTime, 0.666f)*pos_X, process, InOutStage);
	vec2 posRandY = decay( rand(frequency*0.777f, amplitude, iGlobalTime, 0.333f)*pos_Y, process, InOutStage);

	vec2 curPos = (vec2)(posRandX.x, posRandY.x);
	float curRotate = motion.x*rot_Z;//degree
	vec4 color = (vec4)(0.0f);
	float peak = 0.38628f;

	// Remap the scale signals from [-peak, peak] to [-ScaleDown, ScaleUp];
	// the velocity component is scaled by the same span.
	scaleRandX = (vec2)( (scaleRandX.x + peak)*(ScaleXUp + ScaleXDown)/(2.0f*peak) - ScaleXDown,  scaleRandX.y* (ScaleXUp + ScaleXDown)/(2.0f*peak) );
	scaleRandX = decay(scaleRandX, process, InOutStage );
	float ScaleX = scaleRandX.x;

	// Fix: the velocity component previously used scaleRandY.x together with
	// the X-axis limits (copy-paste from the line above). It now mirrors the
	// X-axis expression: velocity (.y) scaled by the Y-axis span.
	scaleRandY =  (vec2)( (scaleRandY.x + peak)*(ScaleYUp + ScaleYDown)/(2.0f*peak) - ScaleYDown,  scaleRandY.y* (ScaleYUp + ScaleYDown)/(2.0f*peak) );
	scaleRandY = decay(scaleRandY, process, InOutStage );
	float ScaleY = scaleRandY.x;

	// Clamp away from zero so scaleFunc never divides by zero.
	vec2 curScale = max( (vec2)(1.0f + ScaleX, 1.0f + ScaleY), (vec2)(0.00001f) );

	int samples = 10; 
	if(needMotionBlur == 0)
		samples = 2;

	vec2 center = (vec2)(0.5f);

	float blurAmp = 0.05f;

	// Blur direction contributed by the positional velocity.
	vec2 dir = (vec2)(posRandX.y, posRandY.y)*motionBlur;

	float processRota = curRotate*0.01745329f; // degrees -> radians
	uv = uv - curPos;

	// Rotate in pixel space so the rotation is not distorted by the aspect ratio.
	uv = rotateFunc(uv*iResolution.xy,iResolution.xy*center,processRota)/iResolution.xy;
	uv = scaleFunc(uv,curScale,center);//scaling

	// Blur direction contributed by the scale velocity (points along the radius).
	vec2 radialDir = radialBlur*(uv - center)*(vec2)(scaleRandX.y*ScaleX,scaleRandY.y*ScaleY)*0.01f;
	float detaRotate = motion.y*rot_Z;

	// Blur direction contributed by the rotational velocity: the radius
	// direction turned by -pi/2, weighted by distance from the centre.
	vec2 RotateDir = rotateblur*rotateFunc(normalize(uv - center), (vec2)(0.0f), - 1.570796f) * length(uv - center)*detaRotate*0.01745329f;

	vec2 totalDir = radialDir+ RotateDir + dir;

	// Offset applied to the y/z channel taps only.
	vec2 processColorSp = (vec2)(0.0f);
	if(RGB_Separate == 1)
		processColorSp = clamp( totalDir*0.1f, -0.02f, 0.02f );

	vec4 outCol; 
	vec2 temp1;
	vec2 temp2;
	if(needMotionBlur == 1)
	{
		for (int i = 0; i < samples; i += 2) //operating at 2 samples for better performance
		{
			temp1 = (uv + (float)(i) / (float)(samples)*totalDir*blurAmp + (vec2)(0.0f));
			temp2 = (uv + (float)(i) / (float)(samples)*totalDir*blurAmp + processColorSp);

			color.xw += INPUT(src_data, mirror(temp1), param).xw;
			color.yz += INPUT(src_data, mirror(temp2), param).yz;

			temp1 = (uv + (float)(i+1) / (float)(samples)*totalDir*blurAmp + (vec2)(0.0f));
			temp2 = (uv + (float)(i+1) / (float)(samples)*totalDir*blurAmp + processColorSp);

			color.xw += INPUT(src_data, mirror(temp1), param).xw;
			color.yz += INPUT(src_data, mirror(temp2), param).yz;
		}
		// Average of all accumulated taps.
		outCol = color/(float)(samples);
	}else
	{
		temp1 = (uv + 1.0f / (float)(samples)*totalDir*blurAmp + (vec2)(0.0f));
		temp2 = (uv + 1.0f / (float)(samples)*totalDir*blurAmp + (vec2)(processColorSp));

		color.xw += INPUT(src_data, mirror(temp1), param).xw;
		color.yz += INPUT(src_data, mirror(temp2), param).yz;

		outCol = color;
	}

	// INPUT() returns .zyxw; swizzling again restores the source channel order.
	write_imagef(dest_data, (int2)(get_global_id(0), get_global_id(1)), outCol.zyxw);
}
